Importing Libraries¶

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

Importing Datasets¶

In [23]:
# Load the dataset
df = pd.read_csv("Obfuscated-MalMem2022.csv")
df.head()
Out[23]:
Category pslist.nproc pslist.nppid pslist.avg_threads pslist.nprocs64bit pslist.avg_handlers dlllist.ndlls dlllist.avg_dlls_per_proc handles.nhandles handles.avg_handles_per_proc ... svcscan.kernel_drivers svcscan.fs_drivers svcscan.process_services svcscan.shared_process_services svcscan.interactive_process_services svcscan.nactive callbacks.ncallbacks callbacks.nanonymous callbacks.ngeneric Class
0 Benign 45 17 10.555556 0 202.844444 1694 38.500000 9129 212.302326 ... 221 26 24 116 0 121 87 0 8 Benign
1 Benign 47 19 11.531915 0 242.234043 2074 44.127660 11385 242.234043 ... 222 26 24 118 0 122 87 0 8 Benign
2 Benign 40 14 14.725000 0 288.225000 1932 48.300000 11529 288.225000 ... 222 26 27 118 0 120 88 0 8 Benign
3 Benign 32 13 13.500000 0 264.281250 1445 45.156250 8457 264.281250 ... 222 26 27 118 0 120 88 0 8 Benign
4 Benign 42 16 11.452381 0 281.333333 2067 49.214286 11816 281.333333 ... 222 26 24 118 0 124 87 0 8 Benign

5 rows × 57 columns

Data Description¶

In [24]:
# Display the columns
print(df.columns)

# Show summary statistics
print(df.describe())
Index(['Category', 'pslist.nproc', 'pslist.nppid', 'pslist.avg_threads',
       'pslist.nprocs64bit', 'pslist.avg_handlers', 'dlllist.ndlls',
       'dlllist.avg_dlls_per_proc', 'handles.nhandles',
       'handles.avg_handles_per_proc', 'handles.nport', 'handles.nfile',
       'handles.nevent', 'handles.ndesktop', 'handles.nkey', 'handles.nthread',
       'handles.ndirectory', 'handles.nsemaphore', 'handles.ntimer',
       'handles.nsection', 'handles.nmutant', 'ldrmodules.not_in_load',
       'ldrmodules.not_in_init', 'ldrmodules.not_in_mem',
       'ldrmodules.not_in_load_avg', 'ldrmodules.not_in_init_avg',
       'ldrmodules.not_in_mem_avg', 'malfind.ninjections',
       'malfind.commitCharge', 'malfind.protection',
       'malfind.uniqueInjections', 'psxview.not_in_pslist',
       'psxview.not_in_eprocess_pool', 'psxview.not_in_ethread_pool',
       'psxview.not_in_pspcid_list', 'psxview.not_in_csrss_handles',
       'psxview.not_in_session', 'psxview.not_in_deskthrd',
       'psxview.not_in_pslist_false_avg',
       'psxview.not_in_eprocess_pool_false_avg',
       'psxview.not_in_ethread_pool_false_avg',
       'psxview.not_in_pspcid_list_false_avg',
       'psxview.not_in_csrss_handles_false_avg',
       'psxview.not_in_session_false_avg', 'psxview.not_in_deskthrd_false_avg',
       'modules.nmodules', 'svcscan.nservices', 'svcscan.kernel_drivers',
       'svcscan.fs_drivers', 'svcscan.process_services',
       'svcscan.shared_process_services',
       'svcscan.interactive_process_services', 'svcscan.nactive',
       'callbacks.ncallbacks', 'callbacks.nanonymous', 'callbacks.ngeneric',
       'Class'],
      dtype='object')
       pslist.nproc  pslist.nppid  pslist.avg_threads  pslist.nprocs64bit  \
count  58596.000000  58596.000000        58596.000000             58596.0   
mean      41.394771     14.713837           11.341655                 0.0   
std        5.777249      2.656748            1.588231                 0.0   
min       21.000000      8.000000            1.650000                 0.0   
25%       40.000000     12.000000            9.972973                 0.0   
50%       41.000000     15.000000           11.000000                 0.0   
75%       43.000000     16.000000           12.861955                 0.0   
max      240.000000     72.000000           16.818182                 0.0   

       pslist.avg_handlers  dlllist.ndlls  dlllist.avg_dlls_per_proc  \
count         58596.000000   58596.000000               58596.000000   
mean            247.509819    1810.805447                  43.707806   
std             111.857790     329.782639                   5.742023   
min              34.962500     670.000000                   7.333333   
25%             208.725000    1556.000000                  38.833333   
50%             243.963710    1735.000000                  42.781524   
75%             289.974322    2087.000000                  49.605280   
max           24845.951220    3443.000000                  53.170732   

       handles.nhandles  handles.avg_handles_per_proc  handles.nport  ...  \
count      5.859600e+04                  58596.000000        58596.0  ...   
mean       1.025858e+04                    249.560958            0.0  ...   
std        4.866864e+03                    145.999866            0.0  ...   
min        3.514000e+03                     71.139241            0.0  ...   
25%        8.393000e+03                    209.648228            0.0  ...   
50%        9.287500e+03                    247.208951            0.0  ...   
75%        1.219300e+04                    291.355050            0.0  ...   
max        1.047310e+06                  33784.193550            0.0  ...   

       svcscan.nservices  svcscan.kernel_drivers  svcscan.fs_drivers  \
count       58596.000000            58596.000000        58596.000000   
mean          391.347549              221.406581           25.996245   
std             4.529704                1.991087            0.170790   
min            94.000000               55.000000            6.000000   
25%           389.000000              221.000000           26.000000   
50%           389.000000              221.000000           26.000000   
75%           395.000000              222.000000           26.000000   
max           395.000000              222.000000           26.000000   

       svcscan.process_services  svcscan.shared_process_services  \
count              58596.000000                     58596.000000   
mean                  25.063417                       116.879514   
std                    1.529628                         1.550401   
min                    7.000000                        26.000000   
25%                   24.000000                       116.000000   
50%                   24.000000                       116.000000   
75%                   27.000000                       118.000000   
max                   27.000000                       118.000000   

       svcscan.interactive_process_services  svcscan.nactive  \
count                               58596.0     58596.000000   
mean                                    0.0       121.995546   
std                                     0.0         2.822858   
min                                     0.0        30.000000   
25%                                     0.0       121.000000   
50%                                     0.0       122.000000   
75%                                     0.0       123.000000   
max                                     0.0       129.000000   

       callbacks.ncallbacks  callbacks.nanonymous  callbacks.ngeneric  
count          58596.000000          58596.000000        58596.000000  
mean              86.905659              0.000853            7.999881  
std                3.134117              0.029199            0.010929  
min               50.000000              0.000000            7.000000  
25%               87.000000              0.000000            8.000000  
50%               87.000000              0.000000            8.000000  
75%               88.000000              0.000000            8.000000  
max               89.000000              1.000000            8.000000  

[8 rows x 55 columns]

Density-KDE Plot¶

In [25]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Scatter plot or Density-KDE plot
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='pslist.nproc', y='pslist.avg_threads')
plt.xlabel('Number of Processes')
plt.ylabel('Average Threads')
plt.title('Scatter Plot: Number of Processes vs Average Threads')
plt.show()
No description has been provided for this image
In [8]:

Data Preprocessing¶

In [26]:
df.isna()
Out[26]:
Category pslist.nproc pslist.nppid pslist.avg_threads pslist.nprocs64bit pslist.avg_handlers dlllist.ndlls dlllist.avg_dlls_per_proc handles.nhandles handles.avg_handles_per_proc ... svcscan.kernel_drivers svcscan.fs_drivers svcscan.process_services svcscan.shared_process_services svcscan.interactive_process_services svcscan.nactive callbacks.ncallbacks callbacks.nanonymous callbacks.ngeneric Class
0 False False False False False False False False False False ... False False False False False False False False False False
1 False False False False False False False False False False ... False False False False False False False False False False
2 False False False False False False False False False False ... False False False False False False False False False False
3 False False False False False False False False False False ... False False False False False False False False False False
4 False False False False False False False False False False ... False False False False False False False False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
58591 False False False False False False False False False False ... False False False False False False False False False False
58592 False False False False False False False False False False ... False False False False False False False False False False
58593 False False False False False False False False False False ... False False False False False False False False False False
58594 False False False False False False False False False False ... False False False False False False False False False False
58595 False False False False False False False False False False ... False False False False False False False False False False

58596 rows × 57 columns

In [27]:
import pandas as pd

#Removing NaN values
df.dropna(inplace=True)
df.head()
Out[27]:
Category pslist.nproc pslist.nppid pslist.avg_threads pslist.nprocs64bit pslist.avg_handlers dlllist.ndlls dlllist.avg_dlls_per_proc handles.nhandles handles.avg_handles_per_proc ... svcscan.kernel_drivers svcscan.fs_drivers svcscan.process_services svcscan.shared_process_services svcscan.interactive_process_services svcscan.nactive callbacks.ncallbacks callbacks.nanonymous callbacks.ngeneric Class
0 Benign 45 17 10.555556 0 202.844444 1694 38.500000 9129 212.302326 ... 221 26 24 116 0 121 87 0 8 Benign
1 Benign 47 19 11.531915 0 242.234043 2074 44.127660 11385 242.234043 ... 222 26 24 118 0 122 87 0 8 Benign
2 Benign 40 14 14.725000 0 288.225000 1932 48.300000 11529 288.225000 ... 222 26 27 118 0 120 88 0 8 Benign
3 Benign 32 13 13.500000 0 264.281250 1445 45.156250 8457 264.281250 ... 222 26 27 118 0 120 88 0 8 Benign
4 Benign 42 16 11.452381 0 281.333333 2067 49.214286 11816 281.333333 ... 222 26 24 118 0 124 87 0 8 Benign

5 rows × 57 columns

In [28]:
df.fillna(df.mean())
<ipython-input-28-a2478f315f9e>:1: FutureWarning: The default value of numeric_only in DataFrame.mean is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
  df.fillna(df.mean())
Out[28]:
Category pslist.nproc pslist.nppid pslist.avg_threads pslist.nprocs64bit pslist.avg_handlers dlllist.ndlls dlllist.avg_dlls_per_proc handles.nhandles handles.avg_handles_per_proc ... svcscan.kernel_drivers svcscan.fs_drivers svcscan.process_services svcscan.shared_process_services svcscan.interactive_process_services svcscan.nactive callbacks.ncallbacks callbacks.nanonymous callbacks.ngeneric Class
0 Benign 45 17 10.555556 0 202.844444 1694 38.500000 9129 212.302326 ... 221 26 24 116 0 121 87 0 8 Benign
1 Benign 47 19 11.531915 0 242.234043 2074 44.127660 11385 242.234043 ... 222 26 24 118 0 122 87 0 8 Benign
2 Benign 40 14 14.725000 0 288.225000 1932 48.300000 11529 288.225000 ... 222 26 27 118 0 120 88 0 8 Benign
3 Benign 32 13 13.500000 0 264.281250 1445 45.156250 8457 264.281250 ... 222 26 27 118 0 120 88 0 8 Benign
4 Benign 42 16 11.452381 0 281.333333 2067 49.214286 11816 281.333333 ... 222 26 24 118 0 124 87 0 8 Benign
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
58591 Ransomware-Shade-fa03be3078d1b9840f06745f160eb... 37 15 10.108108 0 215.486487 1453 39.270270 7973 215.486487 ... 221 26 24 116 0 120 86 0 8 Malware
58592 Ransomware-Shade-f56687137caf9a67678cde91e4614... 37 14 9.945946 0 190.216216 1347 36.405405 7038 190.216216 ... 221 26 24 116 0 116 88 0 8 Malware
58593 Ransomware-Shade-faddeea111a25da4d0888f3044ae9... 38 15 9.842105 0 210.026316 1448 38.105263 7982 215.729730 ... 221 26 24 116 0 120 88 0 8 Malware
58594 Ransomware-Shade-f866c086af2e1d8ebaa6f2c863157... 37 15 10.243243 0 215.513513 1452 39.243243 7974 215.513513 ... 221 26 24 116 0 120 87 0 8 Malware
58595 Ransomware-Shade-955d9af38346c1755527bd196668e... 38 15 9.868421 0 213.026316 1487 39.131579 8095 213.026316 ... 221 26 24 116 0 120 86 0 8 Malware

58596 rows × 57 columns

In [30]:
from sklearn.preprocessing import LabelEncoder

# Create an instance of LabelEncoder
label_encoder = LabelEncoder()

# Convert categorical values to float values
df['Category'] = label_encoder.fit_transform(df['Category'])
df['Class'] = label_encoder.fit_transform(df['Class'])
# Print the transformed DataFrame
print(df)
       Category  pslist.nproc  pslist.nppid  pslist.avg_threads  \
0             0            45            17           10.555556   
1             0            47            19           11.531915   
2             0            40            14           14.725000   
3             0            32            13           13.500000   
4             0            42            16           11.452381   
...         ...           ...           ...                 ...   
58591      9362            37            15           10.108108   
58592      9282            37            14            9.945946   
58593      9411            38            15            9.842105   
58594      9325            37            15           10.243243   
58595      9042            38            15            9.868421   

       pslist.nprocs64bit  pslist.avg_handlers  dlllist.ndlls  \
0                       0           202.844444           1694   
1                       0           242.234043           2074   
2                       0           288.225000           1932   
3                       0           264.281250           1445   
4                       0           281.333333           2067   
...                   ...                  ...            ...   
58591                   0           215.486487           1453   
58592                   0           190.216216           1347   
58593                   0           210.026316           1448   
58594                   0           215.513513           1452   
58595                   0           213.026316           1487   

       dlllist.avg_dlls_per_proc  handles.nhandles  \
0                      38.500000              9129   
1                      44.127660             11385   
2                      48.300000             11529   
3                      45.156250              8457   
4                      49.214286             11816   
...                          ...               ...   
58591                  39.270270              7973   
58592                  36.405405              7038   
58593                  38.105263              7982   
58594                  39.243243              7974   
58595                  39.131579              8095   

       handles.avg_handles_per_proc  ...  svcscan.kernel_drivers  \
0                        212.302326  ...                     221   
1                        242.234043  ...                     222   
2                        288.225000  ...                     222   
3                        264.281250  ...                     222   
4                        281.333333  ...                     222   
...                             ...  ...                     ...   
58591                    215.486487  ...                     221   
58592                    190.216216  ...                     221   
58593                    215.729730  ...                     221   
58594                    215.513513  ...                     221   
58595                    213.026316  ...                     221   

       svcscan.fs_drivers  svcscan.process_services  \
0                      26                        24   
1                      26                        24   
2                      26                        27   
3                      26                        27   
4                      26                        24   
...                   ...                       ...   
58591                  26                        24   
58592                  26                        24   
58593                  26                        24   
58594                  26                        24   
58595                  26                        24   

       svcscan.shared_process_services  svcscan.interactive_process_services  \
0                                  116                                     0   
1                                  118                                     0   
2                                  118                                     0   
3                                  118                                     0   
4                                  118                                     0   
...                                ...                                   ...   
58591                              116                                     0   
58592                              116                                     0   
58593                              116                                     0   
58594                              116                                     0   
58595                              116                                     0   

       svcscan.nactive  callbacks.ncallbacks  callbacks.nanonymous  \
0                  121                    87                     0   
1                  122                    87                     0   
2                  120                    88                     0   
3                  120                    88                     0   
4                  124                    87                     0   
...                ...                   ...                   ...   
58591              120                    86                     0   
58592              116                    88                     0   
58593              120                    88                     0   
58594              120                    87                     0   
58595              120                    86                     0   

       callbacks.ngeneric  Class  
0                       8      0  
1                       8      0  
2                       8      0  
3                       8      0  
4                       8      0  
...                   ...    ...  
58591                   8      1  
58592                   8      1  
58593                   8      1  
58594                   8      1  
58595                   8      1  

[58596 rows x 57 columns]
In [31]:

Out[31]:
Category pslist.nproc pslist.nppid pslist.avg_threads pslist.nprocs64bit pslist.avg_handlers dlllist.ndlls dlllist.avg_dlls_per_proc handles.nhandles handles.avg_handles_per_proc ... svcscan.kernel_drivers svcscan.fs_drivers svcscan.process_services svcscan.shared_process_services svcscan.interactive_process_services svcscan.nactive callbacks.ncallbacks callbacks.nanonymous callbacks.ngeneric Class
0 0 45 17 10.555556 0 202.844444 1694 38.500000 9129 212.302326 ... 221 26 24 116 0 121 87 0 8 0
1 0 47 19 11.531915 0 242.234043 2074 44.127660 11385 242.234043 ... 222 26 24 118 0 122 87 0 8 0
2 0 40 14 14.725000 0 288.225000 1932 48.300000 11529 288.225000 ... 222 26 27 118 0 120 88 0 8 0
3 0 32 13 13.500000 0 264.281250 1445 45.156250 8457 264.281250 ... 222 26 27 118 0 120 88 0 8 0
4 0 42 16 11.452381 0 281.333333 2067 49.214286 11816 281.333333 ... 222 26 24 118 0 124 87 0 8 0

5 rows × 57 columns

In [32]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Load the dataset
df = pd.read_csv('Obfuscated-MalMem2022.csv')

df['Class'] = df['Class'].astype('category').cat.codes

# Diagnostic Analytics for Text Data (Word Clouds)
text_columns = ['Category']
for column in text_columns:
    if df[column].notnull().any():  # Check if column contains at least one non-null value
        plt.figure(figsize=(8, 6))
        wordcloud = WordCloud(background_color='white').generate(' '.join(df[column].astype(str)))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.title(f'Word Cloud - {column}')
        plt.axis('off')
        plt.show()
    else:
        print(f"No data available for word cloud - {column}")

# Diagnostic Analytics for Numerical Data (Correlation Plot)
plt.figure(figsize=(12, 10))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Heatmap')
plt.show()

# Diagnostic Analytics for Numerical Data (Scatterplot with Trendline)
numerical_columns = ['pslist.nproc', 'pslist.avg_threads', 'handles.nhandles']
for column in numerical_columns:
    plt.figure(figsize=(8, 6))
    sns.regplot(x=column, y='Class', data=df, scatter_kws={'alpha': 0.5})
    plt.title(f'Scatterplot with Trendline - {column}')
    plt.xlabel(column)
    plt.ylabel('Class')
    plt.show()
No description has been provided for this image
<ipython-input-32-1d053cf7870a>:26: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  correlation_matrix = df.corr()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

df['Category'] = df['Category'].astype('category').cat.codes

# Split the dataset into features (X) and target variable (y)
X = df.drop('Class', axis=1)
y = df['Class']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest classifier
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

# Predict the target variable for the test set
y_pred = classifier.predict(X_test)

# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test, y_pred)
classification_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{classification_report}")
Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5790
           1       1.00      1.00      1.00      5930

    accuracy                           1.00     11720
   macro avg       1.00      1.00      1.00     11720
weighted avg       1.00      1.00      1.00     11720

In [ ]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Select only the numerical columns
numeric_cols = df.drop("Class", axis=1).select_dtypes(include=[float, int])

# Histogram of all columns
for col in numeric_cols.columns:
    if col != "ldrmodules.not_in_mem_avg":
        sns.histplot(df[col], kde=True)
        plt.title(f"Histogram of {col}")
        plt.xlabel(col)
        plt.ylabel("Frequency")
        plt.show()

# Pair plots of all columns
sns.pairplot(df, hue="Class", diag_kind="hist")
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-8-7f201c040420> in <cell line: 9>()
      9 for col in numeric_cols.columns:
     10     if col != "ldrmodules.not_in_mem_avg":
---> 11         sns.histplot(df[col], kde=True)
     12         plt.title(f"Histogram of {col}")
     13         plt.xlabel(col)

/usr/local/lib/python3.10/dist-packages/seaborn/distributions.py in histplot(data, x, y, hue, weights, stat, bins, binwidth, binrange, discrete, cumulative, common_bins, common_norm, multiple, element, fill, shrink, kde, kde_kws, line_kws, thresh, pthresh, pmax, cbar, cbar_ax, cbar_kws, palette, hue_order, hue_norm, color, log_scale, legend, ax, **kwargs)
   1430     if p.univariate:
   1431 
-> 1432         p.plot_univariate_histogram(
   1433             multiple=multiple,
   1434             element=element,

/usr/local/lib/python3.10/dist-packages/seaborn/distributions.py in plot_univariate_histogram(self, multiple, element, fill, common_norm, common_bins, shrink, kde, kde_kws, color, legend, line_kws, estimate_kws, **plot_kws)
    573 
    574                 plot_func = ax.bar if self.data_variable == "x" else ax.barh
--> 575                 artists = plot_func(
    576                     hist["edges"],
    577                     hist["heights"] - bottom,

/usr/local/lib/python3.10/dist-packages/matplotlib/__init__.py in inner(ax, data, *args, **kwargs)
   1440     def inner(ax, *args, data=None, **kwargs):
   1441         if data is None:
-> 1442             return func(ax, *map(sanitize_sequence, args), **kwargs)
   1443 
   1444         bound = new_sig.bind(ax, *args, **kwargs)

/usr/local/lib/python3.10/dist-packages/matplotlib/axes/_axes.py in bar(self, x, height, width, bottom, align, **kwargs)
   2492             else:  # horizontal
   2493                 r.sticky_edges.x.append(l)
-> 2494             self.add_patch(r)
   2495             patches.append(r)
   2496 

/usr/local/lib/python3.10/dist-packages/matplotlib/axes/_base.py in add_patch(self, p)
   2377         if p.get_clip_path() is None:
   2378             p.set_clip_path(self.patch)
-> 2379         self._update_patch_limits(p)
   2380         self._children.append(p)
   2381         p._remove_method = self._children.remove

/usr/local/lib/python3.10/dist-packages/matplotlib/axes/_base.py in _update_patch_limits(self, patch)
   2399         # Loop through each segment to get extrema for Bezier curve sections
   2400         vertices = []
-> 2401         for curve, code in p.iter_bezier(simplify=False):
   2402             # Get distance along the curve of any extrema
   2403             _, dzeros = curve.axis_aligned_extrema()

/usr/local/lib/python3.10/dist-packages/matplotlib/path.py in iter_bezier(self, **kwargs)
    447             if code == Path.MOVETO:  # a point is like "CURVE1"
    448                 first_vert = verts
--> 449                 yield BezierSegment(np.array([first_vert])), code
    450             elif code == Path.LINETO:  # "CURVE2"
    451                 yield BezierSegment(np.array([prev_vert, verts])), code

/usr/local/lib/python3.10/dist-packages/matplotlib/bezier.py in __init__(self, control_points)
    192         self._cpoints = np.asarray(control_points)
    193         self._N, self._d = self._cpoints.shape
--> 194         self._orders = np.arange(self._N)
    195         coeff = [math.factorial(self._N - 1)
    196                  // (math.factorial(i) * math.factorial(self._N - 1 - i))

KeyboardInterrupt: 
Error in callback <function _draw_all_if_interactive at 0x7fb2e3531000> (for post_execute):
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/matplotlib/pyplot.py in _draw_all_if_interactive()
    118 def _draw_all_if_interactive():
    119     if matplotlib.is_interactive():
--> 120         draw_all()
    121 
    122 

/usr/local/lib/python3.10/dist-packages/matplotlib/_pylab_helpers.py in draw_all(cls, force)
    130         for manager in cls.get_all_fig_managers():
    131             if force or manager.canvas.figure.stale:
--> 132                 manager.canvas.draw_idle()
    133 
    134 

/usr/local/lib/python3.10/dist-packages/matplotlib/backend_bases.py in draw_idle(self, *args, **kwargs)
   2080         if not self._is_idle_drawing:
   2081             with self._idle_draw_cntx():
-> 2082                 self.draw(*args, **kwargs)
   2083 
   2084     @property

/usr/local/lib/python3.10/dist-packages/matplotlib/backends/backend_agg.py in draw(self)
    398              (self.toolbar._wait_cursor_for_draw_cm() if self.toolbar
    399               else nullcontext()):
--> 400             self.figure.draw(self.renderer)
    401             # A GUI class may be need to update a window using this draw, so
    402             # don't forget to call the superclass.

/usr/local/lib/python3.10/dist-packages/matplotlib/artist.py in draw_wrapper(artist, renderer, *args, **kwargs)
     93     @wraps(draw)
     94     def draw_wrapper(artist, renderer, *args, **kwargs):
---> 95         result = draw(artist, renderer, *args, **kwargs)
     96         if renderer._rasterizing:
     97             renderer.stop_rasterizing()

/usr/local/lib/python3.10/dist-packages/matplotlib/artist.py in draw_wrapper(artist, renderer)
     70                 renderer.start_filter()
     71 
---> 72             return draw(artist, renderer)
     73         finally:
     74             if artist.get_agg_filter() is not None:

/usr/local/lib/python3.10/dist-packages/matplotlib/figure.py in draw(self, renderer)
   3138 
   3139             self.patch.draw(renderer)
-> 3140             mimage._draw_list_compositing_images(
   3141                 renderer, self, artists, self.suppressComposite)
   3142 

/usr/local/lib/python3.10/dist-packages/matplotlib/image.py in _draw_list_compositing_images(renderer, parent, artists, suppress_composite)
    129     if not_composite or not has_images:
    130         for a in artists:
--> 131             a.draw(renderer)
    132     else:
    133         # Composite any adjacent images together

/usr/local/lib/python3.10/dist-packages/matplotlib/artist.py in draw_wrapper(artist, renderer)
     70                 renderer.start_filter()
     71 
---> 72             return draw(artist, renderer)
     73         finally:
     74             if artist.get_agg_filter() is not None:

/usr/local/lib/python3.10/dist-packages/matplotlib/axes/_base.py in draw(self, renderer)
   3062             _draw_rasterized(self.figure, artists_rasterized, renderer)
   3063 
-> 3064         mimage._draw_list_compositing_images(
   3065             renderer, self, artists, self.figure.suppressComposite)
   3066 

/usr/local/lib/python3.10/dist-packages/matplotlib/image.py in _draw_list_compositing_images(renderer, parent, artists, suppress_composite)
    129     if not_composite or not has_images:
    130         for a in artists:
--> 131             a.draw(renderer)
    132     else:
    133         # Composite any adjacent images together

/usr/local/lib/python3.10/dist-packages/matplotlib/artist.py in draw_wrapper(artist, renderer)
     70                 renderer.start_filter()
     71 
---> 72             return draw(artist, renderer)
     73         finally:
     74             if artist.get_agg_filter() is not None:

/usr/local/lib/python3.10/dist-packages/matplotlib/patches.py in draw(self, renderer)
    589         tpath = transform.transform_path_non_affine(path)
    590         affine = transform.get_affine()
--> 591         self._draw_paths_with_artist_properties(
    592             renderer,
    593             [(tpath, affine,

/usr/local/lib/python3.10/dist-packages/matplotlib/patches.py in _draw_paths_with_artist_properties(self, renderer, draw_path_args_list)
    543 
    544         renderer.open_group('patch', self.get_gid())
--> 545         gc = renderer.new_gc()
    546 
    547         gc.set_foreground(self._edgecolor, isRGBA=True)

/usr/local/lib/python3.10/dist-packages/matplotlib/backend_bases.py in new_gc(self)
    683     def new_gc(self):
    684         """Return an instance of a `.GraphicsContextBase`."""
--> 685         return GraphicsContextBase()
    686 
    687     def points_to_pixels(self, points):

/usr/local/lib/python3.10/dist-packages/matplotlib/backend_bases.py in __init__(self)
    762         self._forced_alpha = False  # if True, _alpha overrides A from RGBA
    763         self._antialiased = 1  # use 0, 1 not True, False for extension code
--> 764         self._capstyle = CapStyle('butt')
    765         self._cliprect = None
    766         self._clippath = None

/usr/lib/python3.10/enum.py in __call__(cls, value, names, module, qualname, type, start)
    357         return True
    358 
--> 359     def __call__(cls, value, names=None, *, module=None, qualname=None, type=None, start=1):
    360         """
    361         Either returns an existing member, or creates a new enum class.

KeyboardInterrupt: 
Error in callback <function flush_figures at 0x7fb2e3530280> (for post_execute):
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/matplotlib_inline/backend_inline.py in flush_figures()
    124             # ignore the tracking, just draw and close all figures
    125             try:
--> 126                 return show(True)
    127             except Exception as e:
    128                 # safely show traceback if in IPython, else raise

/usr/local/lib/python3.10/dist-packages/matplotlib_inline/backend_inline.py in show(close, block)
     88     try:
     89         for figure_manager in Gcf.get_all_fig_managers():
---> 90             display(
     91                 figure_manager.canvas.figure,
     92                 metadata=_fetch_figure_metadata(figure_manager.canvas.figure)

/usr/local/lib/python3.10/dist-packages/IPython/core/display.py in display(include, exclude, metadata, transient, display_id, *objs, **kwargs)
    318             publish_display_data(data=obj, metadata=metadata, **kwargs)
    319         else:
--> 320             format_dict, md_dict = format(obj, include=include, exclude=exclude)
    321             if not format_dict:
    322                 # nothing to display (e.g. _ipython_display_ took over)

/usr/local/lib/python3.10/dist-packages/IPython/core/formatters.py in format(self, obj, include, exclude)
    178             md = None
    179             try:
--> 180                 data = formatter(obj)
    181             except:
    182                 # FIXME: log the exception

<decorator-gen-2> in __call__(self, obj)

/usr/local/lib/python3.10/dist-packages/IPython/core/formatters.py in catch_format_error(method, self, *args, **kwargs)
    222     """show traceback on failed format call"""
    223     try:
--> 224         r = method(self, *args, **kwargs)
    225     except NotImplementedError:
    226         # don't warn on NotImplementedErrors

/usr/local/lib/python3.10/dist-packages/IPython/core/formatters.py in __call__(self, obj)
    339                 pass
    340             else:
--> 341                 return printer(obj)
    342             # Finally look for special method names
    343             method = get_real_method(obj, self.print_method)

/usr/local/lib/python3.10/dist-packages/IPython/core/pylabtools.py in print_figure(fig, fmt, bbox_inches, base64, **kwargs)
    149         FigureCanvasBase(fig)
    150 
--> 151     fig.canvas.print_figure(bytes_io, **kw)
    152     data = bytes_io.getvalue()
    153     if fmt == 'svg':

/usr/local/lib/python3.10/dist-packages/matplotlib/backend_bases.py in print_figure(self, filename, dpi, facecolor, edgecolor, orientation, format, bbox_inches, pad_inches, bbox_extra_artists, backend, **kwargs)
   2340                 )
   2341                 with getattr(renderer, "_draw_disabled", nullcontext)():
-> 2342                     self.figure.draw(renderer)
   2343 
   2344             if bbox_inches:

/usr/local/lib/python3.10/dist-packages/matplotlib/artist.py in draw_wrapper(artist, renderer, *args, **kwargs)
     93     @wraps(draw)
     94     def draw_wrapper(artist, renderer, *args, **kwargs):
---> 95         result = draw(artist, renderer, *args, **kwargs)
     96         if renderer._rasterizing:
     97             renderer.stop_rasterizing()

/usr/local/lib/python3.10/dist-packages/matplotlib/artist.py in draw_wrapper(artist, renderer)
     70                 renderer.start_filter()
     71 
---> 72             return draw(artist, renderer)
     73         finally:
     74             if artist.get_agg_filter() is not None:

/usr/local/lib/python3.10/dist-packages/matplotlib/figure.py in draw(self, renderer)
   3138 
   3139             self.patch.draw(renderer)
-> 3140             mimage._draw_list_compositing_images(
   3141                 renderer, self, artists, self.suppressComposite)
   3142 

/usr/local/lib/python3.10/dist-packages/matplotlib/image.py in _draw_list_compositing_images(renderer, parent, artists, suppress_composite)
    129     if not_composite or not has_images:
    130         for a in artists:
--> 131             a.draw(renderer)
    132     else:
    133         # Composite any adjacent images together

/usr/local/lib/python3.10/dist-packages/matplotlib/artist.py in draw_wrapper(artist, renderer)
     70                 renderer.start_filter()
     71 
---> 72             return draw(artist, renderer)
     73         finally:
     74             if artist.get_agg_filter() is not None:

/usr/local/lib/python3.10/dist-packages/matplotlib/axes/_base.py in draw(self, renderer)
   3062             _draw_rasterized(self.figure, artists_rasterized, renderer)
   3063 
-> 3064         mimage._draw_list_compositing_images(
   3065             renderer, self, artists, self.figure.suppressComposite)
   3066 

/usr/local/lib/python3.10/dist-packages/matplotlib/image.py in _draw_list_compositing_images(renderer, parent, artists, suppress_composite)
    129     if not_composite or not has_images:
    130         for a in artists:
--> 131             a.draw(renderer)
    132     else:
    133         # Composite any adjacent images together

/usr/local/lib/python3.10/dist-packages/matplotlib/artist.py in draw_wrapper(artist, renderer)
     70                 renderer.start_filter()
     71 
---> 72             return draw(artist, renderer)
     73         finally:
     74             if artist.get_agg_filter() is not None:

/usr/local/lib/python3.10/dist-packages/matplotlib/patches.py in draw(self, renderer)
    589         tpath = transform.transform_path_non_affine(path)
    590         affine = transform.get_affine()
--> 591         self._draw_paths_with_artist_properties(
    592             renderer,
    593             [(tpath, affine,

/usr/local/lib/python3.10/dist-packages/matplotlib/patches.py in _draw_paths_with_artist_properties(self, renderer, draw_path_args_list)
    556 
    557         gc.set_antialiased(self._antialiased)
--> 558         self._set_gc_clip(gc)
    559         gc.set_url(self._url)
    560         gc.set_snap(self.get_snap())

/usr/local/lib/python3.10/dist-packages/matplotlib/artist.py in _set_gc_clip(self, gc)
    932             if self.clipbox is not None:
    933                 gc.set_clip_rectangle(self.clipbox)
--> 934             gc.set_clip_path(self._clippath)
    935         else:
    936             gc.set_clip_rectangle(None)

/usr/local/lib/python3.10/dist-packages/matplotlib/backend_bases.py in set_clip_path(self, path)
    928     def set_clip_path(self, path):
    929         """Set the clip path to a `.TransformedPath` or None."""
--> 930         _api.check_isinstance((transforms.TransformedPath, None), path=path)
    931         self._clippath = path
    932 

/usr/local/lib/python3.10/dist-packages/matplotlib/_api/__init__.py in check_isinstance(_types, **kwargs)
     85                 else f"{tp.__module__}.{tp.__qualname__}")
     86 
---> 87     for k, v in kwargs.items():
     88         if not isinstance(v, types):
     89             names = [*map(type_name, types)]

KeyboardInterrupt: 
In [34]:
# Visualize the distribution of each feature
sns.histplot(df["pslist.nproc"])
plt.show()
No description has been provided for this image
In [35]:
sns.histplot(df["callbacks.ngeneric"])
plt.show()
No description has been provided for this image
In [36]:
sns.histplot(df["pslist.nppid"])
plt.show()
No description has been provided for this image
In [37]:
sns.histplot(df["Class"])
plt.show()
No description has been provided for this image
In [38]:
sns.histplot(df["pslist.avg_threads"])
plt.show()
No description has been provided for this image
In [39]:
sns.histplot(df["dlllist.ndlls"])
plt.show()
No description has been provided for this image
In [40]:
sns.histplot(df["svcscan.shared_process_services"])
plt.show()
No description has been provided for this image

Top 5 Features Shown¶

In [41]:
# Split the dataset into features (X) and target variable (y)
X = df.drop("Class", axis=1)
y = df["Class"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a random forest classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Get feature importances
importances = rf_classifier.feature_importances_

# Create a DataFrame to store feature importances
feature_importances = pd.DataFrame({"Feature": X.columns, "Importance": importances})
feature_importances.sort_values(by="Importance", ascending=False, inplace=True)

# Plot the top five features' importance
sns.barplot(x="Importance", y="Feature", data=feature_importances.head(5))
plt.xlabel("Feature Importance")
plt.ylabel("Feature")
plt.title("Top Five Features Importance")
plt.show()
No description has been provided for this image
In [42]:
# Split the dataset into features (X) and target variable (y)
X = df.drop("Class", axis=1)
y = df["Class"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a random forest classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Predict on the testing set
y_pred = rf_classifier.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-Score: 1.0
In [ ]:
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, cmap="coolwarm", annot=True)
plt.title("Correlation Heatmap")
plt.show()
No description has been provided for this image
In [ ]:
from sklearn.metrics import classification_report

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)

# Calculate feature importances
importances = rf_classifier.feature_importances_

# Sort feature importances in descending order
sorted_indices = np.argsort(importances)[::-1]
sorted_features = X.columns[sorted_indices]

# Plot feature importance
plt.figure(figsize=(10, 6))
plt.bar(sorted_features[:5], importances[sorted_indices][:5])
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.title("Top 5 Feature Importances")
plt.xticks(rotation=45)
plt.show()

# Step 5: Classifier Building
# Train a random forest classifier on the entire dataset
rf_classifier_final = RandomForestClassifier()
rf_classifier_final.fit(X, y)

# Evaluate the classifier's performance on the testing data
y_pred = rf_classifier_final.predict(X_test)
print(classification_report(y_test, y_pred))
No description has been provided for this image
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5790
           1       1.00      1.00      1.00      5930

    accuracy                           1.00     11720
   macro avg       1.00      1.00      1.00     11720
weighted avg       1.00      1.00      1.00     11720

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Step 3: Diagnostic Analysis
correlation_matrix = df.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, cmap="coolwarm", annot=True)
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.show()

# Step 4: Feature Importance
X = df.drop("Class", axis=1)
y = df["Class"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)

importances = rf_classifier.feature_importances_
sorted_indices = np.argsort(importances)[::-1]
sorted_features = X.columns[sorted_indices]

plt.figure(figsize=(10, 6))
plt.bar(sorted_features[:5], importances[sorted_indices][:5])
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.title("Top 5 Feature Importances")
plt.xticks(rotation=45)
plt.show()

# Step 5: Classifier Building
rf_classifier_final = RandomForestClassifier()
rf_classifier_final.fit(X, y)

y_pred = rf_classifier_final.predict(X_test)
print(classification_report(y_test, y_pred))
No description has been provided for this image
No description has been provided for this image
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5790
           1       1.00      1.00      1.00      5930

    accuracy                           1.00     11720
   macro avg       1.00      1.00      1.00     11720
weighted avg       1.00      1.00      1.00     11720

In [ ]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# Compute correlation matrix
correlation_matrix = df.corr()

# Plot correlation matrix as a heatmap
plt.figure(figsize=(100, 80))
sns.heatmap(correlation_matrix, cmap="coolwarm", annot=True)
plt.title("Correlation Heatmap")
plt.show()
Output hidden; open in https://colab.research.google.com to view.

New section¶

In [44]:
!jupyter nbconvert --to html BigData.ipynb
[NbConvertApp] Converting notebook BigData.ipynb to html
[NbConvertApp] Writing 8951276 bytes to BigData.html
In [ ]: